# NOTE: this file is a Jupyter-notebook export. The IPython magic below is
# kept as a comment so the module is valid plain Python.
# %matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# BUG FIX: sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
# sklearn.impute.SimpleImputer is the supported replacement.
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
# Load the vehicle silhouette dataset and take a first look at it.
df = pd.read_csv("vehicle.csv")
df.head()
df.shape
df.info()
df.describe().transpose()

# Work on a copy so the raw data stays untouched.
df_copy = df.copy()
df_copy.shape

# There are NaN values in a few columns; count them per column.
df_copy.isna().sum()

# Count zero values per column. skewness_about and skewness_about.1 contain
# 77 and 30 zeros respectively; these look like genuine measurements rather
# than data-entry errors, so they are kept as-is.
print((df_copy == 0).sum())
"""
We can see we have columns with 'NaN' values. We will replace them with the mean/median accordingly.
circularity 5
distance_circularity 4
radius_ratio 6
pr.axis_aspect_ratio 2
scatter_ratio 1
elongatedness 1
pr.axis_rectangularity 3
scaled_variance 3
scaled_variance.1 2
scaled_radius_of_gyration 2
scaled_radius_of_gyration.1 4
skewness_about 6
skewness_about.1 1
skewness_about.2 1
"""
"""
I can't plot distplot as columns contains NaN values.So i will remove NaN and
then check the data is normally distributed or not
"""
# df_copy_circularity=df_copy['circularity'].dropna(axis=0)
# df_copy_distance_circularity= df_copy['distance_circularity'].dropna(axis=0)
# df_copy_radius_ratio= df_copy['radius_ratio'].dropna(axis=0)
# df_copy_praxis_aspect_ratio= df_copy['pr.axis_aspect_ratio'].dropna(axis=0)
# df_copy_scatter_ratio= df_copy['scatter_ratio'].dropna(axis=0)
# df_copy_elongatedness= df_copy['elongatedness'].dropna(axis=0)
# df_copy_praxis_rectangularity= df_copy['pr.axis_rectangularity'].dropna(axis=0)
# df_copy_scaled_variance= df_copy['scaled_variance'].dropna(axis=0)
# df_copy_scaled_variance1= df_copy['scaled_variance.1'].dropna(axis=0)
# df_copy_scaled_radius_of_gyration= df_copy['scaled_radius_of_gyration'].dropna(axis=0)
# df_copy_scaled_radius_of_gyration1= df_copy['scaled_radius_of_gyration.1'].dropna(axis=0)
# df_copy_skewness_about= df_copy['skewness_about'].dropna(axis=0)
# df_copy_skewness_about1= df_copy['skewness_about.1'].dropna(axis=0)
# df_copy_skewness_about2= df_copy['skewness_about.2'].dropna(axis=0)
# sns.distplot(df_copy_circularity)
# skewness_about=df_copy['skewness_about'].dropna()
# skewness_about.count()
# 'class' is categorical; encode it as integer category codes so the whole
# frame is numeric before imputation.
df_copy['class'] = df_copy['class'].astype('category').cat.codes
df_copy['class'].head()

# Missing values could be imputed with sklearn's imputer or with
# DataFrame.fillna; fillna with the per-column median is used here.
df_copy.isnull().sum()
df_copy_With_No_NAN_Values = df_copy.fillna(df_copy.median())
df_copy_With_No_NAN_Values.isnull().sum()

# Missing values are now handled by replacing them with the column medians.
# (This line was previously bare prose, which is a SyntaxError in Python.)
df_copy_With_No_NAN_Values.boxplot(figsize=(100, 10))
"""
We have outliers in radius_ratio,pr.axis_aspect_ratio,
max.length_aspect_ratio,scaled_variance,scaled_radius_of_gyration.1,skewness_about,skewness_about1
"""
Q1 = df_copy_With_No_NAN_Values.quantile(0.25)
Q3 = df_copy_With_No_NAN_Values.quantile(0.75)
IQR = Q3 - Q1
print(IQR)
df_copy_With_No_NAN_Values = df_copy_With_No_NAN_Values[~((df_copy_With_No_NAN_Values < (Q1 - 1.5 * IQR)) |(df_copy_With_No_NAN_Values > (Q3 + 1.5 * IQR))).any(axis=1)]
df_copy_With_No_NAN_Values.shape
"""
We can see in boxplot,outliers are removed
"""
df_copy_With_No_NAN_Values.boxplot(figsize=(100,10))
# Pairwise Pearson correlation between the cleaned attributes.
df_copy_With_No_NAN_Values_Corr = df_copy_With_No_NAN_Values.corr(method='pearson')
df_copy_With_No_NAN_Values_Corr

# Visualise the correlation matrix as a heatmap, labelling both axes with
# the attribute names.
corr_labels = df_copy_With_No_NAN_Values_Corr.columns
sns.heatmap(
    df_copy_With_No_NAN_Values_Corr,
    xticklabels=corr_labels,
    yticklabels=corr_labels,
)
# Diverging palette for the styled correlation table below.
# BUG FIX: was 'cmap = cmap=...', a redundant chained assignment of the
# same name twice.
cmap = sns.diverging_palette(5, 250, as_cmap=True)
def magnify():
    """Return table styles that magnify cells on hover for a styled DataFrame.

    Returns:
        list[dict]: entries suitable for pandas ``Styler.set_table_styles`` —
        small header/cell fonts by default, and a larger font plus a capped
        cell width when the cursor hovers over a header or a cell.

    Note: the exported source had lost the function-body indentation, which
    is an IndentationError in plain Python; the structure is restored here.
    """
    return [
        dict(selector="th",
             props=[("font-size", "7pt")]),
        dict(selector="td",
             props=[('padding', "0em 0em")]),
        dict(selector="th:hover",
             props=[("font-size", "12pt")]),
        dict(selector="tr:hover td:hover",
             props=[('max-width', '200px'),
                    ('font-size', '12pt')])
    ]
# Styled correlation table: row-wise background gradient, hover magnification.
# NOTE: Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0;
# Styler.format(precision=2) is the supported equivalent.
df_copy_With_No_NAN_Values_Corr.style.background_gradient(cmap, axis=1)\
    .set_properties(**{'max-width': '80px', 'font-size': '10pt'})\
    .set_caption("Hover to magnify")\
    .format(precision=2)\
    .set_table_styles(magnify())
# Inferences from the correlation matrix: compactness shows high significance
# with circularity, distance_circularity, radius_ratio, scatter_ratio,
# pr.axis_rectangularity, max.length_rectangularity, scaled_variance and
# scaled_variance.1, and with elongatedness in a negative way.

# Pair plots of all attributes (KDE on the diagonal).
sns.pairplot(df_copy_With_No_NAN_Values, diag_kind='kde')

# 'class' is the dependent variable, so it is removed before PCA.
X = df_copy_With_No_NAN_Values.drop(['class'], axis=1)
X.head()
y = df_copy_With_No_NAN_Values[['class']]
sns.pairplot(X, diag_kind='kde')

# Scale the data with z-scores: this removes the influence of differing
# attribute units so no single attribute dominates the covariance/PCA step.
from scipy.stats import zscore
XScaled = X.apply(zscore)
XScaled.head()
# Step 1 - covariance matrix of the scaled features (rowvar=False: rows are
# observations, columns are variables).
cov_matrix = np.cov(XScaled, rowvar=False)
# BUG FIX: the format string was passed to print() as a separate argument
# ("print('...%s', x)") instead of being interpolated with the % operator,
# so the literal '%s' was printed alongside the value.
print('Covariance Matrix \n%s' % cov_matrix)

# Step 2 - eigen decomposition of the covariance matrix.
eig_vals, eig_vecs = np.linalg.eig(cov_matrix)
print('Eigen Vectors \n%s' % eig_vecs)
print('\n Eigen Values \n%s' % eig_vals)

# Percentage of variance explained by each component (sorted descending),
# plus the running cumulative total.
tot = sum(eig_vals)
var_exp = [(i / tot) * 100 for i in sorted(eig_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)
print("Cumulative Variance Explained", cum_var_exp)
plt.plot(var_exp)

# Scree plot: individual vs cumulative explained variance per component.
plt.figure(figsize=(10, 5))
plt.bar(range(1, eig_vals.size + 1), var_exp, alpha=0.5, align='center',
        label='Individual explained variance')
plt.step(range(1, eig_vals.size + 1), cum_var_exp, where='mid',
         label='Cumulative explained variance')
plt.ylabel('Explained Variance Ratio')
plt.xlabel('Principal Components')
plt.legend(loc='best')
plt.tight_layout()
plt.show()
"""
Visually we can observe that their is steep drop in variance explained with increase in number of PC's.
We will proceed with 7 components here.
"""
# NOTE - we are generating only 7 PCA dimensions (dimensionality reduction from 18 to 7)
from sklearn.decomposition import PCA
pca7 = PCA(n_components=7)
pca7.fit(XScaled)
print(pca7.components_)
print(pca7.explained_variance_ratio_)
Xpca7=pca7.transform(XScaled)
"""
We can see these are independent and not corelated.So we have reduced dimensions
and covered almost 95 % of variation or information as rest are noise.
"""
sns.pairplot(pd.DataFrame(Xpca7))
from sklearn import svm

XScaled.shape
y.head()

# Train/test split using ALL scaled variables (70/30, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(XScaled, y, test_size=.30, random_state=1)

# gamma is a measure of the influence of a data point; C is the model
# complexity (a lower C creates a simpler hypersurface).
clf = svm.SVC(gamma=0.025, C=3)
clf
# .values.ravel() flattens the single-column DataFrame to a 1-D array,
# avoiding sklearn's DataConversionWarning on fit.
clf.fit(X_train, y_train.values.ravel())
y_predSVM = clf.predict(X_test)

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_predSVM) * 100
print('Accuracy of SVM model with all variables is equal ' + str(round(accuracy, 2)) + ' %.')

# Confusion matrix for the all-variable SVM.
from sklearn.metrics import confusion_matrix
cmSVMAllVar = confusion_matrix(y_test, y_predSVM)
cmSVMAllVar
Xpca7.shape

# Train/test split on the 7 principal components (70/30, same seed so the
# split is comparable with the all-variable model).
X_train7Var, X_test7Var, y_train7Var, y_test7Var = train_test_split(Xpca7, y, test_size=.30, random_state=1)
clf7Var = svm.SVC(gamma=0.25, C=3)
clf7Var
# .values.ravel() avoids sklearn's DataConversionWarning on fit.
clf7Var.fit(X_train7Var, y_train7Var.values.ravel())
y_PredSVM7Var = clf7Var.predict(X_test7Var)
y_PredSVM7Var

from sklearn.metrics import accuracy_score
accuracy7Var = accuracy_score(y_test7Var, y_PredSVM7Var) * 100
print('Accuracy of SVM model with 7 variables is equal ' + str(round(accuracy7Var, 2)) + ' %.')

# Confusion matrix for the 7-component SVM.
from sklearn.metrics import confusion_matrix
cmSVM7Var = confusion_matrix(y_test7Var, y_PredSVM7Var)
cmSVM7Var

# Side-by-side comparison of the two SVM models.
print('Accuracy of SVM model with all variables is equal ' + str(round(accuracy, 2)) + ' %.')
print('Accuracy of SVM model with 7 variables is equal ' + str(round(accuracy7Var, 2)) + ' %.')
from sklearn.naive_bayes import GaussianNB

classifier = GaussianNB()

# Naive Bayes with all variables.
classifier.fit(X_train, y_train.values.ravel())
y_predNaiveAllVar = classifier.predict(X_test)
y_predNaiveAllVar
cmNaiveAllVar = confusion_matrix(y_test, y_predNaiveAllVar)
cmNaiveAllVar
accuracyNaiveBayesAllVar = accuracy_score(y_test, y_predNaiveAllVar) * 100
# BUG FIX: these prints previously reported the SVM 'accuracy' variable
# instead of the Naive Bayes accuracies, so the printed numbers were wrong.
print('Accuracy of Naive Bayes for all Variables is equal ' + str(round(accuracyNaiveBayesAllVar, 2)) + ' %.')

# Naive Bayes with the 7 principal components.
classifier.fit(X_train7Var, y_train7Var.values.ravel())
y_predNaive7Var = classifier.predict(X_test7Var)
y_predNaive7Var
cmNaive7Var = confusion_matrix(y_test7Var, y_predNaive7Var)
cmNaive7Var
accuracyNaiveBayes7Var = accuracy_score(y_test7Var, y_predNaive7Var) * 100
print('Accuracy of Naive Bayes for 7 Variables is equal ' + str(round(accuracyNaiveBayes7Var, 2)) + ' %.')

# Side-by-side comparison of the two Naive Bayes models.
print('Accuracy of Naive Bayes for 7 Variables is equal ' + str(round(accuracyNaiveBayes7Var, 2)) + ' %.')
print('Accuracy of Naive Bayes for all Variables is equal ' + str(round(accuracyNaiveBayesAllVar, 2)) + ' %.')
# Hyper-parameter tuning of the SVM via grid search.
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# Baseline report for the untuned all-variable SVM.
print(classification_report(y_test, y_predSVM))

from sklearn.model_selection import GridSearchCV

# Parameter grid for the RBF-kernel SVM.
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)

# Fit the grid search on the all-variable training split.
# .values.ravel() avoids sklearn's DataConversionWarning on fit.
grid.fit(X_train, y_train.values.ravel())

# Best parameters found, and the refitted estimator.
print(grid.best_params_)
print(grid.best_estimator_)

# Evaluate the tuned model on the held-out test set.
grid_predictions = grid.predict(X_test)
print(classification_report(y_test, grid_predictions))